其他
customize 你的 GSEA 图
我是如此相信
1引言
参考前面 Y 叔 enrichplot 包中的 gseaplot2
代码,我们提取数据自己来一步步画图,比如做一些图形的延伸或者个性化绘图。在这里感谢 Y 叔的贡献及代码。
今天来拿自己的数据来一步步绘制常见的图形和一些不一样的 GSEA 图形。
2准备基因集
msigdbr 这个包方便我们直接下载对应的基因集:
# install.packages("msigdbr")
library(tidyverse)
library(msigdbr)
library(clusterProfiler)
library(org.Mm.eg.db)
library(enrichplot)
library(RColorBrewer)
library(ggrepel)
library(ggplot2)
library(aplot)
# 查看msigdbr包支持的物种
msigdbr_species()
# # A tibble: 20 x 2
# species_name species_common_name
# <chr> <chr>
# 1 Anolis carolinensis Carolina anole, green anole
# 2 Bos taurus bovine, cattle, cow, dairy cow, domestic cattle, domestic cow, o~
# 3 Caenorhabditis elegans NA
# 4 Canis lupus familiaris dog, dogs
# 5 Danio rerio leopard danio, zebra danio, zebra fish, zebrafish
# 6 Drosophila melanogaster fruit fly
# 7 Equus caballus domestic horse, equine, horse
# 8 Felis catus cat, cats, domestic cat
# 9 Gallus gallus bantam, chicken, chickens, Gallus domesticus
# 10 Homo sapiens human
# 11 Macaca mulatta rhesus macaque, rhesus macaques, Rhesus monkey, rhesus monkeys
# 12 Monodelphis domestica gray short-tailed opossum
# 13 Mus musculus house mouse, mouse
# 14 Ornithorhynchus anatinus duck-billed platypus, duckbill platypus, platypus
# 15 Pan troglodytes chimpanzee
# 16 Rattus norvegicus brown rat, Norway rat, rat, rats
# 17 Saccharomyces cerevisiae baker's yeast, brewer's yeast, S. cerevisiae
# 18 Schizosaccharomyces pombe 972h- NA
# 19 Sus scrofa pig, pigs, swine, wild boar
# 20 Xenopus tropicalis tropical clawed frog, western clawed frog
所有基因集:
下载:
我这里下载了小鼠
的生物过程
基因集:
# choose C5 BP gene sets
genesets <- msigdbr(species = "Mus musculus",
category = "C5",
subcategory = "BP")
# check
head(genesets,3)
# # A tibble: 3 x 18
# gs_cat gs_subcat gs_name gene_symbol entrez_gene ensembl_gene human_gene_symb~ human_entrez_ge~
# <chr> <chr> <chr> <chr> <int> <chr> <chr> <int>
# 1 C5 GO:BP GOBP_10_F~ Aasdhppt 67618 ENSMUSG0000~ AASDHPPT 60496
# 2 C5 GO:BP GOBP_10_F~ Aldh1l1 107747 ENSMUSG0000~ ALDH1L1 10840
# 3 C5 GO:BP GOBP_10_F~ Aldh1l2 216188 ENSMUSG0000~ ALDH1L2 160428
# # ... with 10 more variables: human_ensembl_gene <chr>, gs_id <chr>, gs_pmid <chr>, gs_geoid <chr>,
# # gs_exact_source <chr>, gs_url <chr>, gs_description <chr>, taxon_id <int>,
# # ortholog_sources <chr>, num_ortholog_sources <dbl>
# TERM2GENE
gmt <- genesets %>% dplyr::select(gs_name,gene_symbol)
3读取基因
导入我们的基因及对应的差异倍数:
# TERM2GENE
gmt <- genesets %>% dplyr::select(gs_name,gene_symbol)
# load gene
gene <- read.table('gsea-test-data.txt',header = T) %>%
arrange(desc(log2FoldChange))
# check
head(gene,3)
# gene_name log2FoldChange
# 1 Ecscr 6.015527
# 2 Gm32341 5.962540
# 3 B130034C11Rik 5.841789
geneList <- gene$log2FoldChange
names(geneList) <- gene$gene_name
# check
head(geneList,3)
# Ecscr Gm32341 B130034C11Rik
# 6.015527 5.962540 5.841789
4GSEA 富集分析
因为使用的是自己准备的通路集合,所以这里拿的是 GSEA 函数进行富集分析:
# GO enrich
gseaRes <- GSEA(geneList = geneList,
TERM2GENE = gmt,
minGSSize = 10,
maxGSSize = 500,
pvalueCutoff = 1,
pAdjustMethod = "BH",
verbose = FALSE)
# to dataframe
data_ga <- data.frame(gseaRes) %>%
filter(pvalue < 0.05)
画个图看看:
# gseaplot2
gseaplot2(gseaRes, geneSetID = 'GOBP_NUCLEOSIDE_DIPHOSPHATE_METABOLIC_PROCESS',
title = gseaRes$Description['GOBP_NUCLEOSIDE_DIPHOSPHATE_METABOLIC_PROCESS'])
5提取数据
参考之前的,我们先把这个通路对应的基因排序数据提取出来:
gseaScores <- getFromNamespace("gseaScores", "DOSE")
# define function
gsInfo <- function(object, geneSetID) {
geneList <- object@geneList
if (is.numeric(geneSetID))
geneSetID <- object@result[geneSetID, "ID"]
geneSet <- object@geneSets[[geneSetID]]
exponent <- object@params[["exponent"]]
df <- gseaScores(geneList, geneSet, exponent, fortify=TRUE)
df$ymin <- 0
df$ymax <- 0
pos <- df$position == 1
h <- diff(range(df$runningScore))/20
df$ymin[pos] <- -h
df$ymax[pos] <- h
df$geneList <- geneList
df$Description <- object@result[geneSetID, "Description"]
return(df)
}
提取:
# get data
gsdata <- gsInfo(gseaRes, geneSetID = 'GOBP_NUCLEOSIDE_DIPHOSPHATE_METABOLIC_PROCESS')
gsdata1 <- gsdata %>%
mutate("gene_name" = gene$gene_name) %>%
filter(position == 1)
# check
head(gsdata1,3)
# x runningScore position ymin ymax geneList Description gene_name
# 1 4 0.05996027 1 -0.02241342 0.02241342 5.803995 GOBP_NUCLEOSIDE_DIPHOSPHATE_METABOLIC_PROCESS Hkdc1
# 2 29 0.11168581 1 -0.02241342 0.02241342 5.081191 GOBP_NUCLEOSIDE_DIPHOSPHATE_METABOLIC_PROCESS Entpd3
# 3 122 0.15419642 1 -0.02241342 0.02241342 4.426756 GOBP_NUCLEOSIDE_DIPHOSPHATE_METABOLIC_PROCESS Dlg2
# colnames
colnames(gsdata1)
# [1] "x" "runningScore" "position" "ymin" "ymax" "geneList"
# [7] "Description" "gene_name"
6经典图形绘制
这里我们尝试一步步绘制上面常见的图形出来。
curve plot
# plot
pcurve <- ggplot(gsdata,aes(x = x,y = runningScore,color = runningScore)) +
geom_hline(yintercept = 0,size = 1,color = 'black',
lty = 'dashed') +
geom_line() +
# geom_segment(data = gsdata1,aes(xend = x,yend = 0)) +
theme_bw(base_size = 14) +
scale_color_gradient(low = '#76BA99',high = '#EB4747') +
scale_x_continuous(expand = c(0,0)) +
# scale_y_continuous(expand = c(0,0)) +
theme(legend.position = 'none',
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.line.x = element_blank(),
axis.title.x = element_blank(),
legend.background = element_rect(fill = "transparent"),
plot.margin = margin(t = .2,r = .2, b = 0,l = .2,unit = "cm")) +
ylab('Running Enrichment Score')
pcurve
segment plot
pseg <- ggplot(gsdata,aes(x = x,y = runningScore)) +
geom_segment(data = gsdata1,
aes(x = x,xend = x,y = 0,yend = 1),
color = 'black',show.legend = F) +
scale_x_continuous(expand = c(0,0)) +
scale_y_continuous(expand = c(0,0)) +
theme_bw(base_size = 14) +
theme(axis.ticks = element_blank(),
axis.text = element_blank(),
axis.title.y = element_blank(),
panel.grid = element_blank(),
axis.line.x = element_blank(),
plot.margin = margin(t = 0,r = .2,b = .2,l = .2,unit = "cm")) +
xlab('Rank in Ordered Dataset')
pseg
segment and heatmap
v <- seq(1, sum(gsdata$position), length.out = 9)
inv <- findInterval(rev(cumsum(gsdata$position)), v)
if (min(inv) == 0) inv <- inv + 1
col <- c(rev(brewer.pal(5, "Blues")), brewer.pal(5, "Reds"))
# ymin <- min(p2$data$ymin)
# yy <- max(p2$data$ymax - p2$data$ymin) * .3
ymin <- 0
yy <- 0.3
xmin <- which(!duplicated(inv))
xmax <- xmin + as.numeric(table(inv)[as.character(unique(inv))])
d <- data.frame(ymin = ymin, ymax = yy,
xmin = xmin,
xmax = xmax,
col = col[unique(inv)])
pseg_ht <- pseg + geom_rect(
aes_(xmin = ~xmin,xmax = ~xmax,
ymin = ~ymin,ymax = ~ymax,
fill = ~I(col)),
data = d,
alpha = 0.8,
inherit.aes = FALSE)
pseg_ht
gene rank plot
# add gene rank
pseg_ht1 <- pseg_ht + xlab('') +
theme(axis.title.x = element_blank(),
plot.margin = margin(t = -.1,r = .2,b = 0,l = .2,unit = "cm"))
prank <- ggplot(gsdata,aes(x = x,y = geneList)) +
geom_col(width = 1,fill = 'grey80',color = NA) +
# geom_col(aes(fill = geneList),
# width = 1,color = NA,show.legend = F) +
# scale_fill_gradient2(low = col[1],mid = 'white',high = col[length(col)],midpoint = 0) +
geom_hline(yintercept = 0,size = 0.8,color = 'black',
lty = 'dashed') +
theme_bw(base_size = 14) +
theme(panel.grid = element_blank(),
plot.margin = margin(t = -.1,r = .2,b = .2,l = .2,unit = "cm")) +
coord_cartesian(expand = 0) +
ylab('Ranked List') +
xlab('Rank in Ordered Dataset')
prank
拼图
# combine
pall <- aplot::plot_list(gglist = list(pcurve,pseg_ht1,prank),
ncol = 1, heights = c(0.5,0.2,0.3))
pall
最后我们就可以得到一张完整的 GSEA 常见的图形了。
7绘图拓展
有了以上的基础,我们后面可以做一些更多的事情,改改参数,加加代码啥的。
添加基因名
# add gene name
geneLabel <- gsdata1 %>% arrange(desc(runningScore)) %>%
head(20)
plabel <- pcurve +
geom_segment(data = geneLabel,aes(xend = x,yend = 0),
color = 'red') +
geom_text_repel(data = geneLabel,
aes(label = gene_name),
force = 20,
max.overlaps = 50,
# nudge_y = 0.2,
size = 4,
fontface = 'italic')
aplot::plot_list(gglist = list(plabel,pseg_ht1,prank),
ncol = 1, heights = c(0.5,0.2,0.3))
新的 style
panother <- ggplot(gsdata,aes(x = x,y = runningScore,color = runningScore)) +
geom_hline(yintercept = 0,size = 0.8,color = 'black',
lty = 'dashed') +
geom_point() +
geom_line() +
geom_segment(data = gsdata1,aes(xend = x,yend = 0)) +
theme_bw(base_size = 14) +
# scale_color_gradient(low = '#336699',high = '#993399') +
scale_color_gradient2(low = '#336699',mid = 'white',high = '#993399',midpoint = 0.2) +
scale_x_continuous(expand = c(0,0)) +
theme(legend.position = 'none',
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
axis.line.x = element_blank(),
axis.title.x = element_blank(),
legend.background = element_rect(fill = "transparent"),
plot.margin = margin(t = .2,r = .2, b = 0,l = .2,unit = "cm")) +
ylab('Running Enrichment Score')
panother
添加基因名
# add gene name
panother_label <-
panother +
geom_text_repel(data = geneLabel,
aes(label = gene_name),
force = 20,
max.overlaps = 50,
# nudge_y = 0.15,
size = 4,
# color = 'black',
fontface = 'italic')
panother_label
添加热图
# new color
color <- rev(colorRampPalette(c("#336699","white", "#993399"))(10))
ht <- ggplot(gsdata,aes(x = x,y = runningScore)) +
geom_rect(aes_(xmin = ~xmin,xmax = ~xmax,
ymin = ~ymin,ymax = ~ymax,
fill = ~I(color)),
data = d,
alpha = 0.8,
inherit.aes = FALSE) +
scale_x_continuous(expand = c(0,0)) +
scale_y_continuous(expand = c(0,0)) +
theme_bw(base_size = 14) +
theme(panel.grid = element_blank(),
axis.ticks = element_blank(),
axis.text = element_blank(),
axis.title = element_blank(),
plot.margin = margin(t = 0,r = .2, b = .2,l = .2,unit = "cm"))
# combine
aplot::plot_list(gglist = list(panother_label,ht),
ncol = 1, heights = c(0.9,0.1))
8结尾
大家可以自由发挥了。
欢迎加入生信交流群。加我微信我也拉你进 微信群聊 老俊俊生信交流群
(微信交流群需收取20元入群费用(防止骗子和便于管理)
)。
老俊俊微信:
知识星球:
所以今天你学习了吗?
今天的分享就到这里了,敬请期待下一篇!
最后欢迎大家分享转发,您的点赞是对我的鼓励和肯定!
如果觉得对您帮助很大,赏杯快乐水喝喝吧!
往期回顾
◀scRNAtoolVis 绘制单细胞 Marker 基因均值表达热图
◀...